This file contains an example of tuning an XGBoost model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio

# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'

# Load the pre-split training data pickled by an upstream step.
# NOTE(review): pickle.load is only safe on trusted files — these are local
# artifacts produced by this project, so that holds here.
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])
array([0.69875, 0.30125])
# pip install scikit-optimize
from skopt import BayesSearchCV
from sklearn.model_selection import RepeatedKFold

# Search space: a preprocessing + model pipeline for XGBoost, tuned over
# 50 Bayesian-optimization iterations.
search_space = hlp.sklearn_search.ClassifierSearchSpace(
    data=X_train,
    models=[hlp.sklearn_search.ClassifierSearchSpaceModels.XGBoost],
    iterations=[50],
    random_state=42,
)

# 5-fold cross validation, repeated twice, with ROC AUC as the objective.
# NOTE(review): RepeatedKFold is not seeded here, so the CV splits (and thus
# scores) vary slightly run-to-run even though the search itself is seeded.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 139.040 seconds; 2.3 minutes
print(bayes_search.best_score_)
0.7705175280760079
print(bayes_search.best_params_)
OrderedDict([('model', XGBClassifier(base_score=None, booster=None,
colsample_bylevel=0.5213921375991398, colsample_bynode=None,
colsample_bytree=0.8892648282704134, enable_categorical=False,
eval_metric='logloss', gamma=None, gpu_id=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.0036325173225203837, max_delta_step=None,
max_depth=8, min_child_weight=2, missing=nan,
monotone_constraints=None, n_estimators=1341, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=42,
reg_alpha=0.39963322595869505, reg_lambda=1.8262863809878243,
scale_pos_weight=None, subsample=0.7998768156287402,
tree_method=None, use_label_encoder=False,
validate_parameters=None, verbosity=None)), ('model__colsample_bylevel', 0.5213921375991398), ('model__colsample_bytree', 0.8892648282704134), ('model__learning_rate', 0.0036325173225203837), ('model__max_depth', 8), ('model__min_child_weight', 2), ('model__n_estimators', 1341), ('model__reg_alpha', 0.39963322595869505), ('model__reg_lambda', 1.8262863809878243), ('model__subsample', 0.7998768156287402), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer(strategy='median')), ('prep__numeric__scaler__transformer', None)])
# Wrap the fitted searcher in an MLExperimentResults object, snapshot it to
# YAML, then reload from the YAML file so later analysis does not require the
# fitted searcher to be in memory.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=search_space.param_name_mappings()
)
results.to_yaml_file(yaml_file_name='Run 1 - XGBoost - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name='Run 1 - XGBoost - BayesSearchCV.yaml')
results.best_score
0.7705175280760079
results.best_params
{'model': 'XGBClassifier()',
'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243,
'imputer': "SimpleImputer(strategy='median')",
'scaler': 'None',
'encoder': 'OneHotEncoder()'}
results.to_formatted_dataframe(num_rows=100)
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.771 | 0.744 | 0.797 | 8.000 | 0.004 | 1,341.000 | 2.000 | 0.800 | 0.889 | 0.521 | 0.400 | 1.826 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.764 | 0.737 | 0.791 | 50.000 | 0.000 | 100.000 | 1.000 | 0.843 | 0.500 | 0.500 | 1.000 | 1.000 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.763 | 0.737 | 0.789 | 12.000 | 0.000 | 2,000.000 | 1.000 | 0.500 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.762 | 0.744 | 0.780 | 38.000 | 0.016 | 104.000 | 2.000 | 0.505 | 0.547 | 0.555 | 0.296 | 1.623 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.760 | 0.729 | 0.792 | 13.000 | 0.000 | 103.000 | 1.000 | 0.761 | 0.500 | 0.500 | 1.000 | 1.000 | SimpleImputer() | OneHotEncoder() |
| 0.760 | 0.741 | 0.779 | 17.000 | 0.000 | 100.000 | 1.000 | 0.618 | 0.500 | 0.500 | 1.000 | 3.002 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.760 | 0.726 | 0.794 | 9.000 | 0.000 | 100.000 | 1.000 | 0.872 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.757 | 0.737 | 0.778 | 50.000 | 0.000 | 100.000 | 1.000 | 0.500 | 0.558 | 0.660 | 0.001 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.757 | 0.730 | 0.785 | 6.000 | 0.000 | 291.000 | 3.000 | 0.735 | 0.500 | 0.500 | 1.000 | 2.321 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.755 | 0.731 | 0.779 | 50.000 | 0.000 | 100.000 | 1.000 | 0.691 | 0.880 | 0.500 | 0.002 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.754 | 0.739 | 0.770 | 50.000 | 0.000 | 100.000 | 2.000 | 0.757 | 0.500 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.754 | 0.733 | 0.776 | 1.000 | 0.500 | 275.000 | 12.000 | 1.000 | 0.500 | 1.000 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.753 | 0.725 | 0.782 | 50.000 | 0.000 | 847.000 | 4.000 | 0.500 | 0.500 | 0.500 | 1.000 | 3.841 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.751 | 0.722 | 0.780 | 50.000 | 0.000 | 2,000.000 | 2.000 | 0.500 | 1.000 | 0.500 | 1.000 | 1.689 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.749 | 0.729 | 0.770 | 3.000 | 0.000 | 1,050.000 | 2.000 | 0.606 | 0.720 | 0.539 | 0.001 | 2.732 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.749 | 0.721 | 0.778 | 1.000 | 0.174 | 100.000 | 15.000 | 0.954 | 0.745 | 0.926 | 0.601 | 3.354 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.748 | 0.719 | 0.777 | 50.000 | 0.000 | 2,000.000 | 1.000 | 0.580 | 0.883 | 1.000 | 0.024 | 2.547 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.746 | 0.719 | 0.772 | 3.000 | 0.000 | 141.000 | 1.000 | 0.641 | 0.663 | 0.679 | 0.772 | 3.636 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.745 | 0.720 | 0.770 | 50.000 | 0.000 | 100.000 | 1.000 | 1.000 | 1.000 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.743 | 0.730 | 0.755 | 50.000 | 0.000 | 2,000.000 | 2.000 | 0.794 | 1.000 | 0.500 | 1.000 | 1.000 | SimpleImputer() | OneHotEncoder() |
| 0.742 | 0.715 | 0.768 | 2.000 | 0.217 | 159.000 | 6.000 | 0.987 | 0.552 | 0.959 | 0.101 | 2.165 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.739 | 0.713 | 0.765 | 10.000 | 0.009 | 100.000 | 8.000 | 0.986 | 0.906 | 0.900 | 0.015 | 2.603 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.737 | 0.722 | 0.752 | 1.000 | 0.500 | 915.000 | 24.000 | 1.000 | 1.000 | 0.500 | 1.000 | 2.795 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.737 | 0.711 | 0.763 | 50.000 | 0.000 | 2,000.000 | 1.000 | 1.000 | 1.000 | 0.500 | 0.000 | 4.000 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.736 | 0.711 | 0.761 | 50.000 | 0.000 | 100.000 | 1.000 | 0.772 | 1.000 | 1.000 | 0.001 | 1.262 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.735 | 0.716 | 0.754 | 1.000 | 0.000 | 100.000 | 1.000 | 0.500 | 0.500 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.734 | 0.715 | 0.753 | 7.000 | 0.154 | 334.000 | 5.000 | 0.914 | 0.980 | 0.654 | 0.001 | 1.338 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.734 | 0.700 | 0.767 | 23.000 | 0.094 | 396.000 | 4.000 | 0.779 | 0.582 | 0.970 | 0.014 | 2.970 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.733 | 0.707 | 0.758 | 1.000 | 0.000 | 100.000 | 1.000 | 0.799 | 0.500 | 0.500 | 0.000 | 4.000 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.732 | 0.705 | 0.760 | 1.000 | 0.000 | 2,000.000 | 3.000 | 0.552 | 1.000 | 0.627 | 0.017 | 3.540 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.731 | 0.702 | 0.759 | 8.000 | 0.401 | 962.000 | 1.000 | 0.693 | 0.799 | 0.586 | 0.309 | 3.540 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.730 | 0.709 | 0.752 | 10.000 | 0.127 | 255.000 | 3.000 | 0.907 | 0.871 | 0.909 | 0.000 | 3.453 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.730 | 0.714 | 0.745 | 4.000 | 0.419 | 553.000 | 1.000 | 0.913 | 0.748 | 0.960 | 0.000 | 1.707 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.729 | 0.703 | 0.755 | 50.000 | 0.190 | 100.000 | 5.000 | 0.685 | 0.924 | 0.500 | 0.000 | 1.090 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.726 | 0.700 | 0.752 | 50.000 | 0.000 | 182.000 | 14.000 | 1.000 | 0.977 | 1.000 | 0.013 | 2.401 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.725 | 0.702 | 0.748 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | OneHotEncoder() |
| 0.725 | 0.695 | 0.755 | 50.000 | 0.000 | 782.000 | 50.000 | 0.786 | 0.500 | 0.500 | 1.000 | 3.817 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.723 | 0.699 | 0.748 | 1.000 | 0.000 | 100.000 | 8.000 | 1.000 | 0.679 | 0.638 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.723 | 0.697 | 0.749 | 1.000 | 0.283 | 1,422.000 | 2.000 | 0.965 | 0.723 | 0.851 | 0.000 | 2.502 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.720 | 0.696 | 0.743 | 4.000 | 0.208 | 100.000 | 2.000 | 0.588 | 0.936 | 0.850 | 0.003 | 2.280 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.718 | 0.689 | 0.748 | 50.000 | 0.500 | 797.000 | 1.000 | 1.000 | 0.500 | 0.500 | 0.029 | 1.083 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.718 | 0.684 | 0.751 | 17.000 | 0.181 | 491.000 | 22.000 | 0.997 | 0.500 | 0.924 | 0.028 | 1.073 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.716 | 0.701 | 0.732 | 3.000 | 0.000 | 1,913.000 | 1.000 | 1.000 | 0.913 | 0.801 | 0.515 | 1.436 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.713 | 0.683 | 0.743 | 50.000 | 0.500 | 100.000 | 1.000 | 0.500 | 0.500 | 1.000 | 1.000 | 1.000 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.713 | 0.685 | 0.740 | 34.000 | 0.358 | 356.000 | 17.000 | 0.830 | 0.763 | 0.719 | 0.040 | 1.631 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.710 | 0.689 | 0.731 | 4.000 | 0.256 | 669.000 | 17.000 | 0.869 | 0.743 | 0.679 | 0.001 | 3.446 | SimpleImputer() | OneHotEncoder() |
| 0.706 | 0.680 | 0.731 | 18.000 | 0.470 | 236.000 | 9.000 | 0.992 | 0.881 | 0.575 | 0.482 | 1.248 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.694 | 0.670 | 0.718 | 1.000 | 0.000 | 100.000 | 50.000 | 1.000 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.691 | 0.661 | 0.720 | 50.000 | 0.500 | 100.000 | 50.000 | 1.000 | 1.000 | 0.500 | 0.000 | 4.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.689 | 0.661 | 0.717 | 10.000 | 0.437 | 698.000 | 10.000 | 0.817 | 0.680 | 0.888 | 0.005 | 2.766 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.500 | <NA> | <NA> | 1.000 | 0.000 | 100.000 | 50.000 | 0.587 | 0.907 | 0.932 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
# Visualize the search: score across trials, parameter values over time, and
# pairwise relationships between hyper-parameters and performance.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(query="`roc_auc Mean` > 0.5").show()
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
results.plot_performance_across_trials(size='learning_rate', color='encoder').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params(height=800)
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs. a single hyper-parameter, with point size/color carrying two more.
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
    color='encoder'
)
# Hyper-parameter vs. hyper-parameter scatter plots.
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='max_depth'
)
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='imputer'
)
# Build the column name holding the mean score (e.g. "roc_auc Mean") and keep
# only that column plus the hyper-parameter columns.
# (The original line had stray cell output fused onto it by the export.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30 | 0.770518 | 8.0 | 0.003633 | 1341.0 | 2.0 | 0.799877 | 0.889265 | 0.521392 | 0.399633 | 1.826286 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 31 | 0.764005 | 50.0 | 0.000100 | 100.0 | 1.0 | 0.843183 | 0.500000 | 0.500000 | 1.000000 | 1.000000 | SimpleImputer() | CustomOrdinalEncoder() |
| 40 | 0.762873 | 12.0 | 0.000100 | 2000.0 | 1.0 | 0.500000 | 0.500000 | 0.500000 | 1.000000 | 4.000000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 35 | 0.762193 | 38.0 | 0.016298 | 104.0 | 2.0 | 0.505500 | 0.546566 | 0.554556 | 0.295873 | 1.622947 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 39 | 0.760485 | 13.0 | 0.000100 | 103.0 | 1.0 | 0.761034 | 0.500000 | 0.500000 | 1.000000 | 1.000000 | SimpleImputer() | OneHotEncoder() |
# Map each column name to a "formula-safe" version: spaces become underscores
# and any character that is neither alphanumeric nor an underscore is dropped
# (statsmodels formulas cannot contain spaces or special characters).
def _clean_name(name):
    underscored = name.replace(' ', '_')
    return ''.join(ch for ch in underscored if ch == '_' or ch.isalnum())

cleaned_column_names = {column: _clean_name(column) for column in score_dataframe.columns}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'max_depth': 'max_depth',
'learning_rate': 'learning_rate',
'n_estimators': 'n_estimators',
'min_child_weight': 'min_child_weight',
'subsample': 'subsample',
'colsample_bytree': 'colsample_bytree',
'colsample_bylevel': 'colsample_bylevel',
'reg_alpha': 'reg_alpha',
'reg_lambda': 'reg_lambda',
'imputer': 'imputer',
'encoder': 'encoder'}
# Rename the columns to the formula-safe names, then regress the mean CV score
# on every hyper-parameter to see which ones are associated with performance.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# stdlib join replaces hlp.string.collapse(..., separate=" + ", surround="").
X_columns = " + ".join(X_columns)
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object of the same name defined earlier in the notebook.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda + imputer + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.565
Model: OLS Adj. R-squared: 0.424
Method: Least Squares F-statistic: 4.001
Date: Sun, 13 Feb 2022 Prob (F-statistic): 0.000555
Time: 09:56:41 Log-Likelihood: 112.82
No. Observations: 50 AIC: -199.6
Df Residuals: 37 BIC: -174.8
Df Model: 12
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7796 0.039 20.139 0.000 0.701 0.858
imputer[T.SimpleImputer(strategy='median')] 0.0103 0.012 0.867 0.391 -0.014 0.034
imputer[T.SimpleImputer(strategy='most_frequent')] 0.0045 0.012 0.367 0.716 -0.020 0.030
encoder[T.OneHotEncoder()] -0.0119 0.010 -1.208 0.235 -0.032 0.008
max_depth 0.0002 0.000 0.950 0.348 -0.000 0.001
learning_rate -0.0180 0.026 -0.697 0.490 -0.070 0.034
n_estimators 4.981e-06 7.49e-06 0.665 0.510 -1.02e-05 2.02e-05
min_child_weight -0.0016 0.000 -4.464 0.000 -0.002 -0.001
subsample 0.0444 0.030 1.498 0.143 -0.016 0.104
colsample_bytree -0.0350 0.027 -1.284 0.207 -0.090 0.020
colsample_bylevel -0.0621 0.026 -2.420 0.021 -0.114 -0.010
reg_alpha -0.0047 0.010 -0.451 0.655 -0.026 0.016
reg_lambda -0.0023 0.005 -0.505 0.617 -0.011 0.007
==============================================================================
Omnibus: 34.620 Durbin-Watson: 1.389
Prob(Omnibus): 0.000 Jarque-Bera (JB): 109.773
Skew: -1.796 Prob(JB): 1.46e-24
Kurtosis: 9.308 Cond. No. 1.00e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.compose import ColumnTransformer

# Standardize the numeric hyper-parameters so the regression coefficients are
# on comparable scales; pass the non-numeric columns through untouched.
# (Removed: duplicate StandardScaler/pandas imports and an unused `scaler`.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# NOTE: ColumnTransformer returns an object-dtype numpy array, which is why
# the numeric columns are cast back to float in a later cell.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators', 'min_child_weight', 'subsample', 'colsample_bytree', 'colsample_bylevel', 'reg_alpha', 'reg_lambda'] ['imputer', 'encoder']
| roc_auc_Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.034255 | -0.684633 | -0.692475 | 1.148963 | -0.480001 | 0.046517 | 0.752963 | -0.78909 | -0.030446 | -0.38202 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 1 | 0.863079 | 1.303928 | -0.711828 | -0.733215 | -0.554075 | 0.287777 | -1.197205 | -0.898255 | 1.294509 | -1.11942 | SimpleImputer() | CustomOrdinalEncoder() |
| 2 | 0.833322 | -0.495246 | -0.711828 | 2.148443 | -0.554075 | -1.624099 | -1.197205 | -0.898255 | 1.294509 | 1.55786 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 3 | 0.815466 | 0.735767 | -0.623087 | -0.727149 | -0.480001 | -1.593459 | -0.963918 | -0.619856 | -0.259435 | -0.563485 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 4 | 0.770559 | -0.4479 | -0.711828 | -0.728665 | -0.554075 | -0.169876 | -1.197205 | -0.898255 | 1.294509 | -1.11942 | SimpleImputer() | OneHotEncoder() |
# The ColumnTransformer output is an object-dtype array, so cast every numeric
# column back to float in one vectorized call (instead of one .astype per
# column, as the original did across ten repeated lines).
score_dataframe_transformed[numeric_columns] = \
    score_dataframe_transformed[numeric_columns].astype(float)

# Refit the same OLS model on the standardized data; coefficients are now
# directly comparable across hyper-parameters.
print(formula)
model = smf.ols(formula=formula,
                data=score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda + imputer + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.565
Model: OLS Adj. R-squared: 0.424
Method: Least Squares F-statistic: 4.001
Date: Sun, 13 Feb 2022 Prob (F-statistic): 0.000555
Time: 09:59:21 Log-Likelihood: -50.633
No. Observations: 50 AIC: 127.3
Df Residuals: 37 BIC: 152.1
Df Model: 12
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -0.0010 0.264 -0.004 0.997 -0.537 0.535
imputer[T.SimpleImputer(strategy='median')] 0.2710 0.312 0.867 0.391 -0.362 0.904
imputer[T.SimpleImputer(strategy='most_frequent')] 0.1191 0.324 0.367 0.716 -0.538 0.777
encoder[T.OneHotEncoder()] -0.3117 0.258 -1.208 0.235 -0.834 0.211
max_depth 0.1176 0.124 0.950 0.348 -0.133 0.368
learning_rate -0.0862 0.124 -0.697 0.490 -0.337 0.164
n_estimators 0.0863 0.130 0.665 0.510 -0.177 0.349
min_child_weight -0.5838 0.131 -4.464 0.000 -0.849 -0.319
subsample 0.2095 0.140 1.498 0.143 -0.074 0.493
colsample_bytree -0.1838 0.143 -1.284 0.207 -0.474 0.106
colsample_bylevel -0.3201 0.132 -2.420 0.021 -0.588 -0.052
reg_alpha -0.0554 0.123 -0.451 0.655 -0.304 0.193
reg_lambda -0.0672 0.133 -0.505 0.617 -0.337 0.203
==============================================================================
Omnibus: 34.620 Durbin-Watson: 1.389
Prob(Omnibus): 0.000 Jarque-Bera (JB): 109.773
Skew: -1.796 Prob(JB): 1.46e-24
Kurtosis: 9.308 Cond. No. 5.97
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect each term's coefficient and p-value from the fitted OLS model, drop
# the intercept, and flag the terms that are statistically significant at the
# 5% level.
summary_frame = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
is_intercept = summary_frame['feature'] == 'Intercept'
coefficients = summary_frame.loc[~is_intercept].copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | 0.271026 | 0.391334 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | 0.119119 | 0.715606 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | -0.311663 | 0.234535 | False |
| max_depth | max_depth | 0.117628 | 0.348107 | False |
| learning_rate | learning_rate | -0.086176 | 0.489880 | False |
| n_estimators | n_estimators | 0.086322 | 0.510311 | False |
| min_child_weight | min_child_weight | -0.583781 | 0.000073 | True |
| subsample | subsample | 0.209535 | 0.142671 | False |
| colsample_bytree | colsample_bytree | -0.183753 | 0.207071 | False |
| colsample_bylevel | colsample_bylevel | -0.320071 | 0.020559 | True |
| reg_alpha | reg_alpha | -0.055389 | 0.654637 | False |
| reg_lambda | reg_lambda | -0.067244 | 0.616657 | False |
score_variable
'roc_auc Mean'
# Bar chart of the (standardized) regression coefficients, ordered by absolute
# magnitude; color marks statistical significance at the 5% level.
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance

# Permutation importance on the best pipeline found by the search: shuffle one
# feature at a time (10 repeats each) and measure the drop in score.
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# Mean importance per feature (over the 10 repeats), most important first.
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 6.768 seconds
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# BUG FIX: `forest_importances` was sorted by importance, but
# `result.importances_std` is in the original feature order, so the error bars
# were attached to the wrong bars. Align the std values to the sorted index.
importances_std = pd.Series(result.importances_std, index=X_train.columns.to_list())
forest_importances.plot.bar(yerr=importances_std[forest_importances.index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Compare default rates across foreign_worker to sanity-check the permutation
# importance ranking.
temp = X_train.copy()
temp['default'] = y_train
# 'mean' (string) instead of np.mean: passing numpy functions to .agg is
# deprecated in modern pandas.
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age by default status.
# (Removed a block of commented-out template arguments copied from another
# plotting cell.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: `foreign_worker` seems like it should be important (the default rates differ sharply: ~30.8% for 'yes' vs ~10.7% for 'no'), yet it is ranked last in the permutation feature importance.